# Importing the basic libraries for numerics, data handling, statistics and plotting.
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")  # consistent grid background for all seaborn plots
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")  # silence library warnings in the notebook output
CONTEXT: Medical research university X is undergoing deep research on patients with certain conditions. The university has an internal AI team. Due to confidentiality, the patients' details and conditions are masked by the client, who provides separate datasets to the AI team for developing an AIML model that can predict the condition of a patient from the received test results.
DATA DESCRIPTION: The data consists of biomechanics features of the patients according to their current conditions. Each patient is represented in the data set by six biomechanics attributes derived from the shape and orientation of the condition to their body part.
1. P_incidence
2. P_tilt
3. L_angle
4. S_slope
5. P_radius
6. S_degree
7. Class
PROJECT OBJECTIVE: Demonstrate the ability to fetch, process and leverage data to generate useful predictions
by training Supervised Learning algorithms.
1. Import and warehouse data:
• Import all the given datasets and explore shape and size of each.
• Merge all datasets onto one and explore final shape and size.
Import the datasets
# Load the three per-class CSV files (Normal, Type_H, Type_S) into DataFrames.
part1a = pd.read_csv("Part1 - Normal.csv")
part1b = pd.read_csv("Part1 - Type_H.csv")
part1c = pd.read_csv("Part1 - Type_S.csv")
Shape of data
# (rows, columns) of each of the three datasets.
part1a.shape,part1b.shape, part1c.shape
((100, 7), (60, 7), (150, 7))
# Report each dataset's dimensions; \033[1m / \033[0m are ANSI bold on/off codes.
print ("The first dataset, which has class Normal has",'\033[1m', part1a.shape[0],"rows and", part1a.shape[1]," columns",'\033[0m')
print ("The second dataset ,which has class Type H, has",'\033[1m', part1b.shape[0],"rows and", part1b.shape[1]," columns",'\033[0m')
print ("The third dataset ,which has class Type S, has",'\033[1m', part1c.shape[0],"rows and", part1c.shape[1]," columns",'\033[0m')
The first dataset, which has class Normal has 100 rows and 7 columns The second dataset ,which has class Type H, has 60 rows and 7 columns The third dataset ,which has class Type S, has 150 rows and 7 columns
Size of dataset
# Total element count (rows x columns) of each dataset.
part1a.size , part1b.size , part1c.size
(700, 420, 1050)
# Report the element counts printed above in sentence form.
print ("The first dataset has",part1a.size,"elements")
print ("The second dataset has",part1b.size,"elements")
print ("The third dataset has",part1c.size,"elements")
The first dataset has 700 elements The second dataset has 420 elements The third dataset has 1050 elements
Checking the head and tail of the three separate datasets
# First five rows of the Normal-class dataset.
part1a.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 38.505273 | 16.964297 | 35.112814 | 21.540976 | 127.632875 | 7.986683 | Normal |
| 1 | 54.920858 | 18.968430 | 51.601455 | 35.952428 | 125.846646 | 2.001642 | Normal |
| 2 | 44.362490 | 8.945435 | 46.902096 | 35.417055 | 129.220682 | 4.994195 | Normal |
| 3 | 48.318931 | 17.452121 | 48.000000 | 30.866809 | 128.980308 | -0.910941 | Normal |
| 4 | 45.701789 | 10.659859 | 42.577846 | 35.041929 | 130.178314 | -3.388910 | Normal |
# First five rows of the Type_H dataset.
part1b.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 63.027817 | 22.552586 | 39.609117 | 40.475232 | 98.672917 | -0.254400 | Type_H |
| 1 | 39.056951 | 10.060991 | 25.015378 | 28.995960 | 114.405425 | 4.564259 | Type_H |
| 2 | 68.832021 | 22.218482 | 50.092194 | 46.613539 | 105.985135 | -3.530317 | Type_H |
| 3 | 69.297008 | 24.652878 | 44.311238 | 44.644130 | 101.868495 | 11.211523 | Type_H |
| 4 | 49.712859 | 9.652075 | 28.317406 | 40.060784 | 108.168725 | 7.918501 | Type_H |
# First five rows of the Type_S dataset.
part1c.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 74.377678 | 32.053104 | 78.772013 | 42.324573 | 143.560690 | 56.125906 | Type_S |
| 1 | 89.680567 | 32.704435 | 83.130732 | 56.976132 | 129.955476 | 92.027277 | Type_S |
| 2 | 44.529051 | 9.433234 | 52.000000 | 35.095817 | 134.711772 | 29.106575 | Type_S |
| 3 | 77.690577 | 21.380645 | 64.429442 | 56.309932 | 114.818751 | 26.931841 | Type_S |
| 4 | 76.147212 | 21.936186 | 82.961502 | 54.211027 | 123.932010 | 10.431972 | Type_S |
# Last five rows of the Normal-class dataset.
part1a.tail()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 95 | 47.903565 | 13.616688 | 36.000000 | 34.286877 | 117.449062 | -4.245395 | Normal |
| 96 | 53.936748 | 20.721496 | 29.220534 | 33.215251 | 114.365845 | -0.421010 | Normal |
| 97 | 61.446597 | 22.694968 | 46.170347 | 38.751628 | 125.670725 | -2.707880 | Normal |
| 98 | 45.252792 | 8.693157 | 41.583126 | 36.559635 | 118.545842 | 0.214750 | Normal |
| 99 | 33.841641 | 5.073991 | 36.641233 | 28.767649 | 123.945244 | -0.199249 | Normal |
# Last five rows of the Type_H dataset.
part1b.tail()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 55 | 47.657730 | 13.277385 | 36.679985 | 34.380345 | 98.249781 | 6.273012 | Type_H |
| 56 | 43.349606 | 7.467469 | 28.065483 | 35.882137 | 112.776187 | 5.753277 | Type_H |
| 57 | 46.855781 | 15.351514 | 38.000000 | 31.504267 | 116.250917 | 1.662706 | Type_H |
| 58 | 43.203185 | 19.663146 | 35.000000 | 23.540039 | 124.846109 | -2.919076 | Type_H |
| 59 | 48.109236 | 14.930725 | 35.564683 | 33.178512 | 124.056452 | 7.947905 | Type_H |
# Last five rows of the Type_S dataset.
part1c.tail()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 145 | 80.111572 | 33.942432 | 85.101608 | 46.169139 | 125.593624 | 100.292107 | Type_S |
| 146 | 95.480229 | 46.550053 | 59.000000 | 48.930176 | 96.683903 | 77.283072 | Type_S |
| 147 | 74.094731 | 18.823727 | 76.032156 | 55.271004 | 128.405731 | 73.388216 | Type_S |
| 148 | 87.679087 | 20.365613 | 93.822416 | 67.313473 | 120.944829 | 76.730629 | Type_S |
| 149 | 48.259920 | 16.417462 | 36.329137 | 31.842457 | 94.882336 | 28.343799 | Type_S |
Merging the three datasets into a combined dataset
part1 = part1a.append([part1b,part1c])
# Dimensions and element count of the merged dataset.
part1.shape, part1.size
((310, 7), 2170)
# Summarise the merged dataset's dimensions (bold via ANSI escapes).
print ('\033[1m',"The final dataset has", part1.shape[0],"rows and", part1.shape[1]," columns",'\033[0m','and',part1.size,'elements.')
The final dataset has 310 rows and 7 columns and 2170 elements.
# Preview of the merged dataset (row indices restart per source file).
part1.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | Class | |
|---|---|---|---|---|---|---|---|
| 0 | 38.505273 | 16.964297 | 35.112814 | 21.540976 | 127.632875 | 7.986683 | Normal |
| 1 | 54.920858 | 18.968430 | 51.601455 | 35.952428 | 125.846646 | 2.001642 | Normal |
| 2 | 44.362490 | 8.945435 | 46.902096 | 35.417055 | 129.220682 | 4.994195 | Normal |
| 3 | 48.318931 | 17.452121 | 48.000000 | 30.866809 | 128.980308 | -0.910941 | Normal |
| 4 | 45.701789 | 10.659859 | 42.577846 | 35.041929 | 130.178314 | -3.388910 | Normal |
2. Data cleansing:
• Explore and if required correct the datatypes of each attribute
• Explore for null values in the attributes and if required drop or impute values
Information about the datatypes.
# Dtypes and non-null counts; note the index runs 0..149 repeated across files.
part1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 310 entries, 0 to 149 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 310 non-null float64 1 P_tilt 310 non-null float64 2 L_angle 310 non-null float64 3 S_slope 310 non-null float64 4 P_radius 310 non-null float64 5 S_Degree 310 non-null float64 6 Class 310 non-null object dtypes: float64(6), object(1) memory usage: 19.4+ KB
Class has dataType as object we will have to change that.
# Label frequencies -- reveals misspelled variants of the three classes.
part1['Class'].value_counts()
Type_S 133 Normal 73 Type_H 37 Nrmal 27 type_h 23 tp_s 17 Name: Class, dtype: int64
As we can see the 'Class' column has a typographical error (typo) for Type_S, Type_H, Normal as tp_s , type_h , Nrmal respectively. We will correct that.
part1.loc[part1['Class']=='tp_s','Class']='Type_S'
part1.loc[part1['Class']=='Nrmal','Class']='Normal'
part1.loc[part1['Class']=='type_h','Class']='Type_H'
part1['Class'].value_counts()
Type_S 150 Normal 100 Type_H 60 Name: Class, dtype: int64
# Cast the cleaned labels to the memory-efficient 'category' dtype and re-check.
part1['Class']=part1['Class'].astype('category') #changing to category datatype for Class column
part1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 310 entries, 0 to 149 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 P_incidence 310 non-null float64 1 P_tilt 310 non-null float64 2 L_angle 310 non-null float64 3 S_slope 310 non-null float64 4 P_radius 310 non-null float64 5 S_Degree 310 non-null float64 6 Class 310 non-null category dtypes: category(1), float64(6) memory usage: 17.4 KB
Checking for null values
# Count missing values per column.
part1.isnull().sum()
P_incidence 0 P_tilt 0 L_angle 0 S_slope 0 P_radius 0 S_Degree 0 Class 0 dtype: int64
There are no null values in the dataset
3. Data analysis & visualisation:
• Perform detailed statistical analysis on the data.
• Perform a detailed univariate, bivariate and multivariate analysis with appropriate detailed comments after each
analysis.
# Summary statistics (count/mean/std/quartiles) for the numeric attributes.
part1.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| P_incidence | 310.0 | 60.496653 | 17.236520 | 26.147921 | 46.430294 | 58.691038 | 72.877696 | 129.834041 |
| P_tilt | 310.0 | 17.542822 | 10.008330 | -6.554948 | 10.667069 | 16.357689 | 22.120395 | 49.431864 |
| L_angle | 310.0 | 51.930930 | 18.554064 | 14.000000 | 37.000000 | 49.562398 | 63.000000 | 125.742385 |
| S_slope | 310.0 | 42.953831 | 13.423102 | 13.366931 | 33.347122 | 42.404912 | 52.695888 | 121.429566 |
| P_radius | 310.0 | 117.920655 | 13.317377 | 70.082575 | 110.709196 | 118.268178 | 125.467674 | 163.071041 |
| S_Degree | 310.0 | 26.296694 | 37.559027 | -11.058179 | 1.603727 | 11.767934 | 41.287352 | 418.543082 |
# include='all' adds unique/top/freq rows covering the categorical Class column.
part1.describe(include='all').transpose()
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| P_incidence | 310.0 | NaN | NaN | NaN | 60.496653 | 17.23652 | 26.147921 | 46.430294 | 58.691038 | 72.877696 | 129.834041 |
| P_tilt | 310.0 | NaN | NaN | NaN | 17.542822 | 10.00833 | -6.554948 | 10.667069 | 16.357689 | 22.120395 | 49.431864 |
| L_angle | 310.0 | NaN | NaN | NaN | 51.93093 | 18.554064 | 14.0 | 37.0 | 49.562398 | 63.0 | 125.742385 |
| S_slope | 310.0 | NaN | NaN | NaN | 42.953831 | 13.423102 | 13.366931 | 33.347122 | 42.404912 | 52.695888 | 121.429566 |
| P_radius | 310.0 | NaN | NaN | NaN | 117.920655 | 13.317377 | 70.082575 | 110.709196 | 118.268178 | 125.467674 | 163.071041 |
| S_Degree | 310.0 | NaN | NaN | NaN | 26.296694 | 37.559027 | -11.058179 | 1.603727 | 11.767934 | 41.287352 | 418.543082 |
| Class | 310 | 3 | Type_S | 150 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Spread (max - min) of each attribute. Restrict to the numeric columns
# explicitly: the categorical 'Class' column is unordered, and including it
# in max()/min() raises a TypeError on modern pandas.
numeric_part1 = part1.select_dtypes(include='number')
print(numeric_part1.max() - numeric_part1.min())
P_incidence 103.686119 P_tilt 55.986812 L_angle 111.742385 S_slope 108.062635 P_radius 92.988466 S_Degree 429.601261 dtype: float64
Checking the variance of all Columns.
# Variance of each column. NOTE(review): on pandas >= 2.0 this raises for the
# categorical 'Class' column; part1.var(numeric_only=True) is the safe form.
part1.var()
P_incidence 297.097633 P_tilt 100.166675 L_angle 344.253290 S_slope 180.179672 P_radius 177.352531 S_Degree 1410.680476 dtype: float64
To measure the skeweness of every attribute
# Skewness of each column. NOTE(review): on pandas >= 2.0 this raises for the
# categorical 'Class' column; part1.skew(numeric_only=True) is the safe form.
part1.skew()
P_incidence 0.520440 P_tilt 0.676553 L_angle 0.599451 S_slope 0.792577 P_radius -0.176835 S_Degree 4.317954 dtype: float64
# Histogram of S_Degree (sorting the values first does not change the bins;
# kept only to mirror the original flow).
h = sorted(np.asarray(part1['S_Degree']))
plt.hist(h, bins=10)
plt.show()
As we can see that there is high skewness in the S_Degree Column.
# Distribution, violin and box plots of P_incidence.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot with a
# KDE overlay and density scaling is the modern equivalent.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(part1['P_incidence'], kde=True, stat='density', ax=axes[0], color='Green')
sns.violinplot(x='P_incidence', data=part1, ax=axes[1], color='red')
sns.boxplot(x='P_incidence', data=part1, ax=axes[2], color='Green')
axes[0].set_title('Dist Plot')
axes[1].set_title('Violin plot')
axes[2].set_title('Box plot')
plt.show()
Checking count of outliers
# IQR rule: values beyond 1.5*IQR outside the quartiles count as outliers.
q25, q75 = np.percentile(part1['P_incidence'], [25, 75])
IQR = q75 - q25
Threshold = 1.5 * IQR
lower, upper = q25 - Threshold, q75 + Threshold
mask = (part1['P_incidence'] < lower) | (part1['P_incidence'] > upper)
Outliers = part1['P_incidence'][mask].tolist()
print('Total Number of outliers in P_incidence:', len(Outliers))
print(Outliers)
Total Number of outliers in P_incidence: 3 [129.8340406, 118.1446548, 115.9232606]
Normality is maintained in the P_incidence column and there are 3 outliers.
# Distribution, violin and box plots of P_tilt.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot with a
# KDE overlay and density scaling is the modern equivalent.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(part1['P_tilt'], kde=True, stat='density', ax=axes[0], color='Green')
sns.violinplot(x='P_tilt', data=part1, ax=axes[1], color='red')
sns.boxplot(x='P_tilt', data=part1, ax=axes[2], color='Green')
axes[0].set_title('Dist Plot')
axes[1].set_title('Violin plot')
axes[2].set_title('Box plot')
plt.show()
Checking count of outliers
# IQR rule applied to P_tilt.
q25, q75 = np.percentile(part1['P_tilt'], [25, 75])
IQR = q75 - q25
Threshold = 1.5 * IQR
lower, upper = q25 - Threshold, q75 + Threshold
mask = (part1['P_tilt'] < lower) | (part1['P_tilt'] > upper)
Outliers = part1['P_tilt'][mask].tolist()
print('Total Number of outliers in P_tilt:', len(Outliers))
print(Outliers)
Total Number of outliers in P_tilt: 13 [41.55733141, 41.28630543, 39.82272448, 42.39620445, -6.554948347, 48.06953097, 39.84466878, 48.90365265, 42.68919513, 49.4318636, 40.30376567, 41.93368293, 46.55005318]
We can see that the P_Tilt has slightly right skewed data and there are negative as well as positive outliers.
# Distribution, violin and box plots of L_angle.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot with a
# KDE overlay and density scaling is the modern equivalent.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(part1['L_angle'], kde=True, stat='density', ax=axes[0], color='Green')
sns.violinplot(x='L_angle', data=part1, ax=axes[1], color='red')
sns.boxplot(x='L_angle', data=part1, ax=axes[2], color='Green')
axes[0].set_title('Dist Plot')
axes[1].set_title('Violin plot')
axes[2].set_title('Box plot')
plt.show()
Checking count of Outliers
# IQR rule applied to L_angle.
q25, q75 = np.percentile(part1['L_angle'], [25, 75])
IQR = q75 - q25
Threshold = 1.5 * IQR
lower, upper = q25 - Threshold, q75 + Threshold
mask = (part1['L_angle'] < lower) | (part1['L_angle'] > upper)
Outliers = part1['L_angle'][mask].tolist()
print('Total Number of outliers in L_angle:', len(Outliers))
print(Outliers)
Total Number of outliers in L_angle: 1 [125.7423855]
We can see slight right skewness due to 1 outlier.
# Distribution, violin and box plots of S_slope.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot with a
# KDE overlay and density scaling is the modern equivalent.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(part1['S_slope'], kde=True, stat='density', ax=axes[0], color='Green')
sns.violinplot(x='S_slope', data=part1, ax=axes[1], color='red')
sns.boxplot(x='S_slope', data=part1, ax=axes[2], color='Green')
axes[0].set_title('Dist Plot')
axes[1].set_title('Violin plot')
axes[2].set_title('Box plot')
plt.show()
Checking Count of outliers
# IQR rule applied to S_slope.
q25, q75 = np.percentile(part1['S_slope'], [25, 75])
IQR = q75 - q25
Threshold = 1.5 * IQR
lower, upper = q25 - Threshold, q75 + Threshold
mask = (part1['S_slope'] < lower) | (part1['S_slope'] > upper)
Outliers = part1['S_slope'][mask].tolist()
print('Total Number of outliers in S_slope:', len(Outliers))
print(Outliers)
Total Number of outliers in S_slope: 1 [121.4295656]
# Distribution, violin and box plots of P_radius.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot with a
# KDE overlay and density scaling is the modern equivalent.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(part1['P_radius'], kde=True, stat='density', ax=axes[0], color='Green')
sns.violinplot(x='P_radius', data=part1, ax=axes[1], color='red')
sns.boxplot(x='P_radius', data=part1, ax=axes[2], color='Green')
axes[0].set_title('Dist Plot')
axes[1].set_title('Violin plot')
axes[2].set_title('Box plot')
plt.show()
Checking for outliers in P_radius.
# IQR rule applied to P_radius.
q25, q75 = np.percentile(part1['P_radius'], [25, 75])
IQR = q75 - q25
Threshold = 1.5 * IQR
lower, upper = q25 - Threshold, q75 + Threshold
mask = (part1['P_radius'] < lower) | (part1['P_radius'] > upper)
Outliers = part1['P_radius'][mask].tolist()
print('Total Number of outliers in P_radius:', len(Outliers))
print(Outliers)
Total Number of outliers in P_radius: 11 [147.8946372, 84.24141517, 148.5255624, 151.8398566, 163.0710405, 82.45603817, 81.0245406, 70.08257486, 78.99945411, 157.848799, 88.43424213]
# Distribution, violin and box plots of S_Degree (box plot drawn horizontally).
# seaborn.distplot was deprecated in 0.11 and removed in 0.14; histplot with a
# KDE overlay and density scaling is the modern equivalent.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(part1['S_Degree'], kde=True, stat='density', ax=axes[0], color='Green')
sns.violinplot(x='S_Degree', data=part1, ax=axes[1], color='red')
sns.boxplot(x='S_Degree', data=part1, ax=axes[2], color='Green', orient="h")
axes[0].set_title('Dist Plot')
axes[1].set_title('Violin plot')
axes[2].set_title('Box plot')
plt.show()
Checking for outliers in S_Degree.
# IQR rule applied to S_Degree.
q25, q75 = np.percentile(part1['S_Degree'], [25, 75])
IQR = q75 - q25
Threshold = 1.5 * IQR
lower, upper = q25 - Threshold, q75 + Threshold
mask = (part1['S_Degree'] < lower) | (part1['S_Degree'] > upper)
Outliers = part1['S_Degree'][mask].tolist()
print('Total Number of outliers in S_Degree:', len(Outliers))
print(Outliers)
Total Number of outliers in S_Degree: 10 [101.7190919, 145.3781432, 110.8607824, 148.7537109, 418.5430821, 118.3533701, 104.8592474, 124.9844057, 117.3146829, 101.2187828]
# Class balance: pie chart of label shares plus a count bar plot.
f, axes = plt.subplots(1, 2, figsize=(15, 7))
part1['Class'].value_counts().plot.pie(autopct='%1.1f%%', ax=axes[0])
# seaborn >= 0.12 removed positional data-variable passing; the column must
# be given as a keyword (x='Class').
sns.countplot(x='Class', data=part1, ax=axes[1])
axes[0].set_title('Pie Chart')
axes[1].set_title('Bar Graph')
plt.show()
As we can see, Type_S has the maximum share, i.e., 48.4% of the entire dataset.
Class and P_incidence Swarm plot Box plot Bar plot between Class and P_incidence attribute.
# Swarm, box and point plots of P_incidence per Class.
# The stray plt.figure() in the original opened an unused empty figure
# (the "<Figure ... with 0 Axes>" artefact); plt.subplots already provides
# the canvas, so it is removed.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.swarmplot(x='Class', y='P_incidence', data=part1, ax=axes[0])
sns.boxplot(x='Class', y='P_incidence', data=part1, ax=axes[1])
sns.pointplot(x='Class', y='P_incidence', data=part1, ax=axes[2])
plt.show()
<Figure size 1080x504 with 0 Axes>
Class and P_tilt Swarm plot, Box plot and Point plot between Class and P_tilt attribute.
# Swarm, box and point plots of P_tilt per Class (point plot in red).
# Removed the stray plt.figure() that opened an unused empty figure.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.swarmplot(x='Class', y='P_tilt', data=part1, ax=axes[0])
sns.boxplot(x='Class', y='P_tilt', data=part1, ax=axes[1])
sns.pointplot(x='Class', y='P_tilt', data=part1, ax=axes[2], color='red')
plt.show()
<Figure size 1080x504 with 0 Axes>
Class and L_angle Swarm plot Box plot Point plot between Class and L_angle attribute.
# Swarm, box and point plots of L_angle per Class (points not joined).
# Removed the stray plt.figure() that opened an unused empty figure.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.swarmplot(x='Class', y='L_angle', data=part1, ax=axes[0])
sns.boxplot(x='Class', y='L_angle', data=part1, ax=axes[1])
sns.pointplot(x='Class', y='L_angle', data=part1, ax=axes[2], join=False)
plt.show()
<Figure size 1080x504 with 0 Axes>
Class and S_slope Swarm plot Box plot, Point plot between Class and S_slope attribute.
# Swarm, box and point plots of S_slope per Class.
# Removed the stray plt.figure() that opened an unused empty figure.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.swarmplot(x='Class', y='S_slope', data=part1, ax=axes[0])
sns.boxplot(x='Class', y='S_slope', data=part1, ax=axes[1])
sns.pointplot(x='Class', y='S_slope', data=part1, ax=axes[2])
plt.show()
<Figure size 1080x504 with 0 Axes>
Class and P_radius Swarm plot Box plot, Point plot between Class and P_radius attribute.
# Swarm, box and point plots of P_radius per Class.
# Removed the stray plt.figure() that opened an unused empty figure.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.swarmplot(x='Class', y='P_radius', data=part1, ax=axes[0])
sns.boxplot(x='Class', y='P_radius', data=part1, ax=axes[1])
sns.pointplot(x='Class', y='P_radius', data=part1, ax=axes[2])
plt.show()
<Figure size 1080x504 with 0 Axes>
Class and P_radius Swarm plot Box plot, Point plot between Class and S_Degree attribute.
# Swarm, box and point plots of S_Degree per Class.
# Removed the stray plt.figure() that opened an unused empty figure.
f, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.swarmplot(x='Class', y='S_Degree', data=part1, ax=axes[0])
sns.boxplot(x='Class', y='S_Degree', data=part1, ax=axes[1])
sns.pointplot(x='Class', y='S_Degree', data=part1, ax=axes[2])
plt.show()
<Figure size 1080x504 with 0 Axes>
# Pairwise scatter/KDE plots of all numeric attributes.
sns.pairplot(part1)
<seaborn.axisgrid.PairGrid at 0x25bded548e0>
# Same pair plot, coloured by the target Class for visual separability.
sns.pairplot(part1,hue='Class')
<seaborn.axisgrid.PairGrid at 0x25be0145610>
# Per-class mean of each numeric attribute.
class_summary=part1.groupby('Class')
class_summary.mean().reset_index()
| Class | P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | |
|---|---|---|---|---|---|---|---|
| 0 | Normal | 51.685244 | 12.821414 | 43.542605 | 38.863830 | 123.890834 | 2.186572 |
| 1 | Type_H | 47.638407 | 17.398795 | 35.463524 | 30.239612 | 116.474968 | 2.480251 |
| 2 | Type_S | 71.514224 | 20.748038 | 64.110108 | 50.766186 | 114.518810 | 51.896687 |
We can see that Type_S contains the highest values across most attributes.
# Pearson correlation matrix of the numeric attributes.
# NOTE(review): on pandas >= 2.0 this raises for the categorical 'Class'
# column; part1.corr(numeric_only=True) is the safe form.
part1.corr()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | |
|---|---|---|---|---|---|---|
| P_incidence | 1.000000 | 0.629199 | 0.717282 | 0.814960 | -0.247467 | 0.638743 |
| P_tilt | 0.629199 | 1.000000 | 0.432764 | 0.062345 | 0.032668 | 0.397862 |
| L_angle | 0.717282 | 0.432764 | 1.000000 | 0.598387 | -0.080344 | 0.533667 |
| S_slope | 0.814960 | 0.062345 | 0.598387 | 1.000000 | -0.342128 | 0.523557 |
| P_radius | -0.247467 | 0.032668 | -0.080344 | -0.342128 | 1.000000 | -0.026065 |
| S_Degree | 0.638743 | 0.397862 | 0.533667 | 0.523557 | -0.026065 | 1.000000 |
# Annotated heatmap of the correlation matrix (same numeric_only caveat as above).
sns.heatmap(part1.corr(), annot=True,linewidths=.5)
<AxesSubplot:>
4. Data pre-processing:
• Segregate predictors vs target attributes
• Perform normalisation or scaling if required.
• Check for target balancing. Add your comments.
• Perform train-test split.
Segregating Predictor and Target attributes.
# Predictor matrix: every column except the target.
# NOTE(review): drop() returns a copy, so later in-place changes to part1
# (e.g. outlier imputation) will NOT propagate into X -- confirm intent.
X= part1.drop(['Class'], axis = 1) #Predictor attributes
X.head()
| P_incidence | P_tilt | L_angle | S_slope | P_radius | S_Degree | |
|---|---|---|---|---|---|---|
| 0 | 38.505273 | 16.964297 | 35.112814 | 21.540976 | 127.632875 | 7.986683 |
| 1 | 54.920858 | 18.968430 | 51.601455 | 35.952428 | 125.846646 | 2.001642 |
| 2 | 44.362490 | 8.945435 | 46.902096 | 35.417055 | 129.220682 | 4.994195 |
| 3 | 48.318931 | 17.452121 | 48.000000 | 30.866809 | 128.980308 | -0.910941 |
| 4 | 45.701789 | 10.659859 | 42.577846 | 35.041929 | 130.178314 | -3.388910 |
# Target vector (categorical class labels).
y = part1['Class'] #Target Attribute
y.head()
0 Normal 1 Normal 2 Normal 3 Normal 4 Normal Name: Class, dtype: category Categories (3, object): ['Normal', 'Type_H', 'Type_S']
Perform normalisation or scaling if required.
Outlier Analysis
# Count IQR outliers in every predictor column before imputation.
for col in X:
    q1, q3 = np.percentile(part1[col], [25, 75])
    fence = 1.5 * (q3 - q1)
    lo, hi = q1 - fence, q3 + fence
    n_outliers = int(((part1[col] < lo) | (part1[col] > hi)).sum())
    print('Total Number of outliers in', col, 'Before Imputing :', n_outliers)
Total Number of outliers in P_incidence Before Imputing : 3 Total Number of outliers in P_tilt Before Imputing : 13 Total Number of outliers in L_angle Before Imputing : 1 Total Number of outliers in S_slope Before Imputing : 1 Total Number of outliers in P_radius Before Imputing : 11 Total Number of outliers in S_Degree Before Imputing : 10
# BUG FIX: the original loop reused the lower/upper fences left over from the
# previous cell (always the last column's), so every column's "mean without
# outliers" was computed against the wrong bounds -- visible in the printed
# P_radius mean of ~93.2 vs its true mean of ~117.9. Recompute the fences
# per column before taking the trimmed mean.
for c in X:
    q1, q3 = np.percentile(part1[c], [25, 75])
    threshold = 1.5 * (q3 - q1)
    lower, upper = q1 - threshold, q3 + threshold
    part1_include = part1.loc[(part1[c] >= lower) & (part1[c] <= upper)]
    mean = part1_include[c].mean()
    print('Mean of', c, 'is without outliers', mean)
Mean of P_incidence is without outliers 59.902477042833894 Mean of P_tilt is without outliers 17.542821967970955 Mean of L_angle is without outliers 51.692057577896435 Mean of S_slope is without outliers 42.69986418265373 Mean of P_radius is without outliers 93.21369994787878 Mean of S_Degree is without outliers 22.199966587796656
# BUG FIX: the original imputed EVERY column with a single stale mean and
# stale lower/upper fences carried over from earlier cells. Recompute each
# column's own fences and trimmed mean before replacing its outliers.
for c in X:
    q1, q3 = np.percentile(part1[c], [25, 75])
    threshold = 1.5 * (q3 - q1)
    lower, upper = q1 - threshold, q3 + threshold
    # trimmed mean of this column, ignoring its own outliers
    col_mean = part1.loc[(part1[c] >= lower) & (part1[c] <= upper), c].mean()
    part1[c] = np.where((part1[c] > upper) | (part1[c] < lower), col_mean, part1[c])
    remaining = int(((part1[c] < lower) | (part1[c] > upper)).sum())
    print('Total Number of outliers in', c, 'After Imputing :', remaining)
Total Number of outliers in P_incidence After Imputing : 0 Total Number of outliers in P_tilt After Imputing : 0 Total Number of outliers in L_angle After Imputing : 0 Total Number of outliers in S_slope After Imputing : 0 Total Number of outliers in P_radius After Imputing : 0 Total Number of outliers in S_Degree After Imputing : 0
Scaling
# Standardise each predictor to zero mean / unit variance.
from scipy.stats import zscore
# NOTE(review): X was copied from part1 BEFORE the outlier imputation above,
# so the scaled matrix still contains the original outlier values -- confirm.
X_Scaled=X.apply(zscore)
X_Scaled.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| P_incidence | 310.0 | 1.042177e-16 | 1.001617 | -1.996010 | -0.817398 | -0.104925 | 0.719464 | 4.029206 |
| P_tilt | 310.0 | 2.096889e-16 | 1.001617 | -2.411664 | -0.688114 | -0.118606 | 0.458116 | 3.191402 |
| L_angle | 310.0 | 1.980495e-16 | 1.001617 | -2.047652 | -0.806027 | -0.127862 | 0.597549 | 3.984615 |
| S_slope | 310.0 | 2.829278e-17 | 1.001617 | -2.207741 | -0.716842 | -0.040960 | 0.726941 | 5.855771 |
| P_radius | 310.0 | -5.071212e-16 | 1.001617 | -3.597963 | -0.542383 | 0.026138 | 0.567621 | 3.395818 |
| S_Degree | 310.0 | -1.277204e-16 | 1.001617 | -0.996172 | -0.658507 | -0.387450 | 0.399768 | 10.460350 |
We can see that all the columns have a standard deviation of around 1 .
Check for target balancing. Add your comments.
# Label-encode the target: maps the three class labels to integer codes 0/1/2
# (alphabetical order), NOT one-hot dummy variables as originally commented.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
part1['Class'] = le.fit_transform(part1['Class'])
part1['Class'].value_counts()
2 150 0 100 1 60 Name: Class, dtype: int64
As we know, LabelEncoder assigns codes alphabetically, so: 0 = Normal
1 = Type_H
2 = Type_S
# Pie chart of the encoded target distribution.
a=part1['Class'].value_counts().plot.pie(autopct='%1.1f%%')
a.set_title('Target Variable Pie Chart')
plt.show()
As we can see the Target Variable is imbalanced. Type_s / 2 has 48.4% of all the data. This can lead to the model not learning about less distributed classes. This gives poor performance in unseen data.
Perform train-test split.
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed for reproducibility.
# NOTE(review): this splits the raw X, not X_Scaled, and omits stratify=y
# despite the class imbalance observed above -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)
# Training-split dimensions.
print(X_train.shape)
print(y_train.shape)
(217, 6) (217,)
# Test-split dimensions.
print(X_test.shape)
print(y_test.shape)
(93, 6) (93,)
5. Model training, testing and tuning:
• Design and train a KNN classifier.
• Display the classification accuracies for train and test data.
• Display and explain the classification report in detail.
• Automate the task of finding best values of K for KNN.
• Apply all the possible tuning techniques to train the best model for the given data. Select the final best trained
model with your comments for selecting this model.
#importing necessary libraries for KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,precision_recall_fscore_support
Design and train a KNN classifier.
# instantiate learning model (k = 3) -- baseline KNN before tuning k
knn = KNeighborsClassifier(n_neighbors = 3)
# fitting the model on the training split
knn.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=3)
# predict the response for the held-out test split
y_pred = knn.predict(X_test)
Display the classification accuracies for train and test data.
# Compare train vs test accuracy to gauge over/under-fitting.
print('Accuracy on Training data:',knn.score(X_train, y_train) )
print('Accuracy on Test data:',knn.score(X_test, y_test) )
Accuracy on Training data: 0.9216589861751152 Accuracy on Test data: 0.8064516129032258
Training accuracy is 92% and testing accuracy is 80%. Performance is lower on the test data.
This is due to overfitting of data
Display and explain the classification report in detail.
# Confusion matrix as a labelled DataFrame, rendered as an annotated heatmap.
labels = ["Normal", "Type_H", "Type_S"]
confusionmatrix = confusion_matrix(y_test, y_pred)
part1_confusionmatrix = pd.DataFrame(confusionmatrix, index=labels, columns=labels)
plt.figure(figsize=(7, 5))
sns.heatmap(part1_confusionmatrix, annot=True, fmt='g')
plt.show()
# Per-class precision/recall/F1 plus macro and weighted averages.
print("Classification Matrix:\n",classification_report(y_test,y_pred))
Classification Matrix:
precision recall f1-score support
Normal 0.76 0.57 0.65 28
Type_H 0.54 0.78 0.64 18
Type_S 0.98 0.96 0.97 47
accuracy 0.81 93
macro avg 0.76 0.77 0.75 93
weighted avg 0.83 0.81 0.81 93
Automate the task of finding best values of K for KNN.
# Sweep k = 1..50, recording train and test accuracy to visualise the
# bias/variance trade-off.
train_score = []
test_score = []
for k in range(1, 51):
    model = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    model.fit(X_train, y_train)
    train_score.append(model.score(X_train, y_train))
    test_score.append(model.score(X_test, y_test))
plt.plot(range(1, 51), train_score)
plt.show()
Training accuracy decreases as we increase the value of K.
# Test accuracy across the same k sweep.
plt.plot(range(1,51),test_score)
plt.show()
We can see that the maximum test accuracy occurs when k is less than 20. Thus we will fix k value less than 20
# Evaluate odd k values below 20 and pick the one minimising the
# misclassification error on the test split.
neighbors = list(range(1, 20, 2))  # odd values of k
ac_scores = []
for k in neighbors:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    ac_scores.append(accuracy_score(y_test, model.predict(X_test)))
# misclassification error = 1 - accuracy
MSE = [1 - score for score in ac_scores]
# best k is the one with the smallest error
optimal_k = neighbors[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)
The optimal number of neighbors is 13
# Misclassification error vs k for the odd-k sweep above.
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.title('Plot of misclassification error vs k (with k value on X-axis) using matplotlib.')
plt.show()
Building Model with K = 13
# Final KNN model with the selected k = 13; report train/test accuracy.
knn = KNeighborsClassifier(n_neighbors=13)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print('Accuracy on Training data:', knn.score(X_train, y_train))
print('Accuracy on Test data:', knn.score(X_test, y_test))
Accuracy on Training data: 0.8709677419354839 Accuracy on Test data: 0.8387096774193549
Apply all the possible tuning techniques to train the best model for the given data. Select the final best trained model with your comments for selecting this model.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn import model_selection
Logistic Regression
# Logistic regression baseline with the liblinear solver.
logreg = LogisticRegression(solver='liblinear', multi_class='auto')
logreg.fit(X_train, y_train)
# cache the test accuracy for later model comparison
lr_score = logreg.score(X_test, y_test)
print('For Logistic Regression:')
print('Accuracy on Training data', logreg.score(X_train, y_train))
print('Accuracy on Test data: ', lr_score)
For Logistic Regression: Accuracy on Training data 0.8617511520737328 Accuracy on Test data: 0.7849462365591398
Naive Bayes
# Gaussian Naive Bayes baseline.
nb = GaussianNB()
nb.fit(X_train, y_train)
train_acc = nb.score(X_train, y_train)
test_acc = nb.score(X_test, y_test)
print('For Naive Bayes:')
print('Accuracy on Training data', train_acc)
print('Accuracy on Test data: ', test_acc)
For Naive Bayes: Accuracy on Training data 0.8387096774193549 Accuracy on Test data: 0.8172043010752689
Support Vector Machines
# Building a Support Vector Machine on train data with linear kernel
# Linear-kernel SVM; the small C (0.1) gives a wider, more regularised
# margin. (gamma has no effect for a linear kernel.)
svc_model = SVC(C=0.1, kernel='linear', gamma=1)
svc_model.fit(X_train, y_train)
print('For SVM with Linear kernel:')
print('Accuracy on Training data', svc_model.score(X_train, y_train))
print('Accuracy on Test data: ', svc_model.score(X_test, y_test))
For SVM with Linear kernel: Accuracy on Training data 0.8847926267281107 Accuracy on Test data: 0.8494623655913979
# Building a Support Vector Machine on train data with rbf kernel
# RBF-kernel SVM with default hyper-parameters.
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)
print('For SVM with rbf kernel:')
print('Accuracy on Training data', svc_model.score(X_train, y_train))
print('Accuracy on Test data: ', svc_model.score(X_test, y_test))
For SVM with rbf kernel: Accuracy on Training data 0.880184331797235 Accuracy on Test data: 0.8494623655913979
# Building a Support Vector Machine on train data with poly kernel
svc_model = SVC(kernel='poly')
svc_model.fit(X_train, y_train)
print('For SVM with poly kernel:')
print('Accuracy on Training data',svc_model.score(X_train, y_train) )
print('Accuracy on Test data: ',svc_model.score(X_test, y_test) )
For SVM with poly kernel: Accuracy on Training data 0.8709677419354839 Accuracy on Test data: 0.8602150537634409
# Building a Support Vector Machine on train data with sigmoid kernel
svc_model = SVC(kernel='sigmoid')
svc_model.fit(X_train, y_train)
# BUG FIX: the header previously printed "rbf kernel" — corrected to
# match the sigmoid kernel actually being evaluated here.
print('For SVM with sigmoid kernel:')
print('Accuracy on Training data', svc_model.score(X_train, y_train))
print('Accuracy on Test data: ', svc_model.score(X_test, y_test))
For SVM with rbf kernel: Accuracy on Training data 0.47465437788018433 Accuracy on Test data: 0.5053763440860215
From the above results we can see that the SVM with the poly kernel gives the highest accuracy on the test data, i.e. 86%.
6. Conclusion and improvisation:
Write your conclusion on the results.
From the results of the various tuning techniques we can see that the SVM with the poly kernel gives the best training and test accuracies. It performs well on both the training and test sets.
Also, SVC with sigmoid kernel gives the lowest results for both training and testing sets.
Detailed suggestions or improvements or on quality, quantity, variety, velocity, veracity etc. on the data points
collected by the research team to perform a better data analysis in future.
• CONTEXT: A bank X is on a massive digital transformation for all its departments. Bank has a growing customer base where majority of them are liability customers (depositors) vs borrowers (asset customers). The bank is interested in expanding the borrowers base rapidly to bring in more business via loan interests. A campaign that the bank ran in last quarter showed an
average single digit conversion rate. Digital transformation being the core strength of the business strategy, marketing
department wants to devise effective campaigns with better target marketing to increase the conversion ratio to double digit
with same budget as per last campaign.
• DATA DESCRIPTION: The data consists of the following attributes:
1. ID: Customer ID
2. Age Customer’s approximate age.
3. CustomerSince: Customer of the bank since. [unit is masked]
4. HighestSpend: Customer’s highest spend so far in one transaction. [unit is masked]
5. ZipCode: Customer’s zip code.
6. HiddenScore: A score associated to the customer which is masked by the bank as an IP.
7. MonthlyAverageSpend: Customer’s monthly average spend so far. [unit is masked]
8. Level: A level associated to the customer which is masked by the bank as an IP.
9. Mortgage: Customer’s mortgage. [unit is masked]
10. Security: Customer’s security asset with the bank. [unit is masked]
11. FixedDepositAccount: Customer’s fixed deposit account with the bank. [unit is masked]
12. InternetBanking: if the customer uses internet banking.
13. CreditCard: if the customer uses bank’s credit card.
14. LoanOnCard: if the customer has a loan on credit card.
1. Import and warehouse data:
• Import all the given datasets and explore shape and size of each.
• Merge all datasets onto one and explore final shape and size.
Import the datasets
part2a = pd.read_csv("Part2 - Data1.csv")
part2b = pd.read_csv("Part2 -Data2.csv")
Shape of Datasets
part2a.shape , part2b.shape
((5000, 8), (5000, 7))
print ("The first dataset has",'\033[1m', part2a.shape[0],"rows and", part2a.shape[1]," columns",'\033[0m')
print ("The second dataset has",'\033[1m', part2b.shape[0],"rows and", part2b.shape[1]," columns",'\033[0m')
The first dataset has 5000 rows and 8 columns The second dataset has 5000 rows and 7 columns
Size of dataset
part2a.size,part2b.size
(40000, 35000)
print ("The first dataset has",part2a.size,"elements")
print ("The second dataset has",part2b.size,"elements")
The first dataset has 40000 elements The second dataset has 35000 elements
Columns in the given separate datasets.
part2a.columns
Index(['ID', 'Age', 'CustomerSince', 'HighestSpend', 'ZipCode', 'HiddenScore',
'MonthlyAverageSpend', 'Level'],
dtype='object')
part2b.columns
Index(['ID', 'Mortgage', 'Security', 'FixedDepositAccount', 'InternetBanking',
'CreditCard', 'LoanOnCard'],
dtype='object')
Checking Sample Records for individual dataset
part2a.head()
| ID | Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 |
part2b.head()
| ID | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 0 | 0 | NaN |
| 1 | 2 | 0 | 1 | 0 | 0 | 0 | NaN |
| 2 | 3 | 0 | 0 | 0 | 0 | 0 | NaN |
| 3 | 4 | 0 | 0 | 0 | 0 | 0 | NaN |
| 4 | 5 | 0 | 0 | 0 | 0 | 1 | NaN |
part2a.tail()
| ID | Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | |
|---|---|---|---|---|---|---|---|---|
| 4995 | 4996 | 29 | 3 | 40 | 92697 | 1 | 1.9 | 3 |
| 4996 | 4997 | 30 | 4 | 15 | 92037 | 4 | 0.4 | 1 |
| 4997 | 4998 | 63 | 39 | 24 | 93023 | 2 | 0.3 | 3 |
| 4998 | 4999 | 65 | 40 | 49 | 90034 | 3 | 0.5 | 2 |
| 4999 | 5000 | 28 | 4 | 83 | 92612 | 3 | 0.8 | 1 |
part2b.tail()
| ID | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|
| 4995 | 4996 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 4996 | 4997 | 85 | 0 | 0 | 1 | 0 | 0.0 |
| 4997 | 4998 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 4998 | 4999 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 4999 | 5000 | 0 | 0 | 0 | 1 | 1 | 0.0 |
We can see that most of the records for the second dataset are 0.
Merging the datasets into a combined dataset
part2=part2a.merge(part2b,left_on='ID',right_on='ID') #ID field is common in both the datasets; inner join keeps rows present in both
part2.shape
(5000, 14)
print ("The first dataset has",'\033[1m', part2.shape[0],"rows and", part2.shape[1]," columns",'\033[0m')
The first dataset has 5000 rows and 14 columns
print ("The final dataset has",part2.size,"elements")
The final dataset has 70000 elements
part2.head()
| ID | Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 1 | 0 | 0 | 0 | NaN |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 1 | 0 | 0 | 0 | NaN |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | NaN |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 1 | NaN |
part2.columns
Index(['ID', 'Age', 'CustomerSince', 'HighestSpend', 'ZipCode', 'HiddenScore',
'MonthlyAverageSpend', 'Level', 'Mortgage', 'Security',
'FixedDepositAccount', 'InternetBanking', 'CreditCard', 'LoanOnCard'],
dtype='object')
2. Data cleansing:
• Explore and if required correct the datatypes of each attribute
• Explore for null values in the attributes and if required drop or impute values.
Exploring the datatypes
part2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 CustomerSince 5000 non-null int64 3 HighestSpend 5000 non-null int64 4 ZipCode 5000 non-null int64 5 HiddenScore 5000 non-null int64 6 MonthlyAverageSpend 5000 non-null float64 7 Level 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Security 5000 non-null int64 10 FixedDepositAccount 5000 non-null int64 11 InternetBanking 5000 non-null int64 12 CreditCard 5000 non-null int64 13 LoanOnCard 4980 non-null float64 dtypes: float64(2), int64(12) memory usage: 585.9 KB
part2['Security'].value_counts()
0 4478 1 522 Name: Security, dtype: int64
part2['FixedDepositAccount'].value_counts()
0 4698 1 302 Name: FixedDepositAccount, dtype: int64
# These flag/score columns hold discrete codes, not measurements,
# so convert them to pandas categoricals.
col = ['HiddenScore', 'Level', 'Security', 'FixedDepositAccount',
       'InternetBanking', 'CreditCard', 'LoanOnCard']
for name in col:
    part2[name] = part2[name].astype('category')
# Zip codes are identifiers: keep them as zero-padded 5-character strings.
part2['ZipCode'] = part2['ZipCode'].astype(str).str.zfill(5)
part2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 CustomerSince 5000 non-null int64 3 HighestSpend 5000 non-null int64 4 ZipCode 5000 non-null object 5 HiddenScore 5000 non-null category 6 MonthlyAverageSpend 5000 non-null float64 7 Level 5000 non-null category 8 Mortgage 5000 non-null int64 9 Security 5000 non-null category 10 FixedDepositAccount 5000 non-null category 11 InternetBanking 5000 non-null category 12 CreditCard 5000 non-null category 13 LoanOnCard 4980 non-null category dtypes: category(7), float64(1), int64(5), object(1) memory usage: 347.6+ KB
We can see that there are null values in the LoanOnCard column.
Explore for null values in the attributes and if required drop or impute values.
part2.isnull().sum()
ID 0 Age 0 CustomerSince 0 HighestSpend 0 ZipCode 0 HiddenScore 0 MonthlyAverageSpend 0 Level 0 Mortgage 0 Security 0 FixedDepositAccount 0 InternetBanking 0 CreditCard 0 LoanOnCard 20 dtype: int64
There are 20 null values in LoanOnCard column, since this is relatively a small number compared to the size of the entire dataset we can drop these null values.
part2.dropna(axis=0,inplace=True)
part2.isnull().sum()
ID 0 Age 0 CustomerSince 0 HighestSpend 0 ZipCode 0 HiddenScore 0 MonthlyAverageSpend 0 Level 0 Mortgage 0 Security 0 FixedDepositAccount 0 InternetBanking 0 CreditCard 0 LoanOnCard 0 dtype: int64
Also, the column ID has no further relevance in any model building hence we can drop this column.
part2.drop('ID',axis=1,inplace=True)
part2.head()
| Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | LoanOnCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 34 | 9 | 180 | 93023 | 1 | 8.9 | 3 | 0 | 0 | 0 | 0 | 0 | 1.0 |
| 10 | 65 | 39 | 105 | 94710 | 4 | 2.4 | 3 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 11 | 29 | 5 | 45 | 90277 | 3 | 0.1 | 2 | 0 | 0 | 0 | 1 | 0 | 0.0 |
| 12 | 48 | 23 | 114 | 93106 | 2 | 3.8 | 3 | 0 | 1 | 0 | 0 | 0 | 0.0 |
| 13 | 59 | 32 | 40 | 94920 | 4 | 2.5 | 2 | 0 | 0 | 0 | 1 | 0 | 0.0 |
3. Data analysis & visualisation:
• Perform detailed statistical analysis on the data.
• Perform a detailed univariate, bivariate and multivariate analysis with appropriate detailed comments after each analysis.
part2.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 4980.0 | 45.352610 | 11.464212 | 23.0 | 35.0 | 45.0 | 55.000 | 67.0 |
| CustomerSince | 4980.0 | 20.117671 | 11.468716 | -3.0 | 10.0 | 20.0 | 30.000 | 43.0 |
| HighestSpend | 4980.0 | 73.852410 | 46.070090 | 8.0 | 39.0 | 64.0 | 98.000 | 224.0 |
| MonthlyAverageSpend | 4980.0 | 1.939536 | 1.750006 | 0.0 | 0.7 | 1.5 | 2.525 | 10.0 |
| Mortgage | 4980.0 | 56.589759 | 101.836758 | 0.0 | 0.0 | 0.0 | 101.000 | 635.0 |
part2.describe(include = ("category",'object')).T
| count | unique | top | freq | |
|---|---|---|---|---|
| ZipCode | 4980 | 467 | 94720 | 167 |
| HiddenScore | 4980 | 4 | 1 | 1466 |
| Level | 4980 | 3 | 1 | 2089 |
| Security | 4980 | 2 | 0 | 4460 |
| FixedDepositAccount | 4980 | 2 | 0 | 4678 |
| InternetBanking | 4980 | 2 | 1 | 2974 |
| CreditCard | 4980 | 2 | 0 | 3514 |
| LoanOnCard | 4980.0 | 2.0 | 0.0 | 4500.0 |
Distribution and outlier analysis of numerical variables
# Distribution plot, box plot and an IQR-based outlier count for every
# numeric column.
coln = ['Age', 'CustomerSince', 'HighestSpend', 'MonthlyAverageSpend', 'Mortgage']
for feature in coln:
    fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(15, 5))
    sns.distplot(part2[feature], ax=ax_dist, color='forestgreen')
    sns.boxplot(x=feature, data=part2, orient='h', ax=ax_box, color='darkseagreen')
    ax_dist.set_title('Distribution plot of {}'.format(feature))
    ax_box.set_title('Box plot of {}'.format(feature))
    plt.show()
    # Tukey's rule: values beyond 1.5 * IQR from the quartiles are outliers.
    q25, q75 = np.percentile(part2[feature], 25), np.percentile(part2[feature], 75)
    fence = (q75 - q25) * 1.5
    lower, upper = q25 - fence, q75 + fence
    outlier_count = len([v for v in part2[feature] if v < lower or v > upper])
    print('Total Number of outliers in {} {}'.format(feature, outlier_count))
Total Number of outliers in Age 0
Total Number of outliers in CustomerSince 0
Total Number of outliers in HighestSpend 96
Total Number of outliers in MonthlyAverageSpend 324
Total Number of outliers in Mortgage 291
Age
CustomerSince
HighestSpend
MonthlyAverageSpend
Mortgage
HiddenScore
# Category share (pie) and ordered counts (bar) for HiddenScore.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(15, 5))
part2['HiddenScore'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_pie)
sns.countplot('HiddenScore', data=part2, ax=ax_bar, order=[1, 2, 4, 3])
ax_pie.set_title('Pie Chart of {}'.format('HiddenScore'))
ax_bar.set_title('Bar Chart of {}'.format('HiddenScore'))
plt.show()
Level
# Category share (pie) and ordered counts (bar) for Level.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(15, 5))
part2['Level'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_pie)
sns.countplot('Level', data=part2, ax=ax_bar, order=[1, 3, 2])
ax_pie.set_title('Pie Chart of {}'.format('Level'))
ax_bar.set_title('Bar Chart of {}'.format('Level'))
plt.show()
Security
# Category share (pie) and ordered counts (bar) for Security.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(15, 5))
part2['Security'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_pie)
sns.countplot('Security', data=part2, ax=ax_bar, order=[0, 1])
ax_pie.set_title('Pie Chart of {}'.format('Security'))
ax_bar.set_title('Bar Chart of {}'.format('Security'))
plt.show()
FixedDepositAccount
# Category share (pie) and ordered counts (bar) for FixedDepositAccount.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(15, 5))
part2['FixedDepositAccount'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_pie)
sns.countplot('FixedDepositAccount', data=part2, ax=ax_bar, order=[1, 0])
ax_pie.set_title('Pie Chart of {}'.format('FixedDepositAccount'))
ax_bar.set_title('Bar Chart of {}'.format('FixedDepositAccount'))
plt.show()
InternetBanking
# Category share (pie) and ordered counts (bar) for InternetBanking.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(15, 5))
part2['InternetBanking'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_pie)
sns.countplot('InternetBanking', data=part2, ax=ax_bar, order=[1, 0])
ax_pie.set_title('Pie Chart of {}'.format('InternetBanking'))
ax_bar.set_title('Bar Chart of {}'.format('InternetBanking'))
plt.show()
CreditCard
# Category share (pie) and ordered counts (bar) for CreditCard.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(15, 5))
part2['CreditCard'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_pie)
sns.countplot('CreditCard', data=part2, ax=ax_bar, order=[0, 1])
ax_pie.set_title('Pie Chart of {}'.format('CreditCard'))
ax_bar.set_title('Bar Chart of {}'.format('CreditCard'))
plt.show()
LoanOnCard (Target Variable)
# Category share (pie) and ordered counts (bar) for the LoanOnCard target.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(15, 5))
part2['LoanOnCard'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax_pie)
sns.countplot('LoanOnCard', data=part2, ax=ax_bar, order=[0, 1])
ax_pie.set_title('Pie Chart of {}'.format('LoanOnCard'))
ax_bar.set_title('Bar Chart of {}'.format('LoanOnCard'))
plt.show()
LoanOnCard vs CustomerSince
# Box plot and point plot of CustomerSince split by the LoanOnCard target.
# BUG FIX: removed a stray plt.figure(figsize=(15,7)) call that only
# produced an empty extra figure ("<Figure ... with 0 Axes>").
f, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(x='LoanOnCard', y='CustomerSince', data=part2, ax=axes[0])
sns.pointplot(x='LoanOnCard', y='CustomerSince', data=part2, ax=axes[1])
plt.show()
<Figure size 1080x504 with 0 Axes>
LoanOnCard vs Age
# Box plot and point plot of Age split by the LoanOnCard target.
# BUG FIX: removed a stray plt.figure(figsize=(15,7)) call that only
# produced an empty extra figure ("<Figure ... with 0 Axes>").
f, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(x='LoanOnCard', y='Age', data=part2, ax=axes[0])
sns.pointplot(x='LoanOnCard', y='Age', data=part2, ax=axes[1])
plt.show()
<Figure size 1080x504 with 0 Axes>
LoanOnCard vs HighestSpend
# Box plot and point plot of HighestSpend split by the LoanOnCard target.
# BUG FIX: removed a stray plt.figure(figsize=(15,7)) call that only
# produced an empty extra figure ("<Figure ... with 0 Axes>").
f, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(x='LoanOnCard', y='HighestSpend', data=part2, ax=axes[0])
sns.pointplot(x='LoanOnCard', y='HighestSpend', data=part2, ax=axes[1])
plt.show()
<Figure size 1080x504 with 0 Axes>
MonthlyAverageSpend vs LoanOnCard
# Box plot and point plot of MonthlyAverageSpend split by the LoanOnCard target.
# BUG FIX: removed a stray plt.figure(figsize=(15,7)) call that only
# produced an empty extra figure ("<Figure ... with 0 Axes>").
f, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(x='LoanOnCard', y='MonthlyAverageSpend', data=part2, ax=axes[0])
sns.pointplot(x='LoanOnCard', y='MonthlyAverageSpend', data=part2, ax=axes[1])
plt.show()
<Figure size 1080x504 with 0 Axes>
LoanOnCard vs Mortgage
# Box plot and point plot of Mortgage split by the LoanOnCard target.
# BUG FIX: removed a stray plt.figure(figsize=(15,7)) call that only
# produced an empty extra figure ("<Figure ... with 0 Axes>").
f, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(x='LoanOnCard', y='Mortgage', data=part2, ax=axes[0])
sns.pointplot(x='LoanOnCard', y='Mortgage', data=part2, ax=axes[1])
plt.show()
<Figure size 1080x504 with 0 Axes>
MonthlyAverageSpend vs HighestSpend
plt.figure(figsize=(25,10))
sns.regplot(x='MonthlyAverageSpend',y='HighestSpend', data=part2)
<AxesSubplot:xlabel='MonthlyAverageSpend', ylabel='HighestSpend'>
We can see that there is clearly a linear relationship between these two variables.
Correlation Between Variable
part2.corr()
| Age | CustomerSince | HighestSpend | MonthlyAverageSpend | Mortgage | |
|---|---|---|---|---|---|
| Age | 1.000000 | 0.994208 | -0.054951 | -0.051896 | -0.013272 |
| CustomerSince | 0.994208 | 1.000000 | -0.046092 | -0.049918 | -0.011380 |
| HighestSpend | -0.054951 | -0.046092 | 1.000000 | 0.646109 | 0.207236 |
| MonthlyAverageSpend | -0.051896 | -0.049918 | 0.646109 | 1.000000 | 0.110275 |
| Mortgage | -0.013272 | -0.011380 | 0.207236 | 0.110275 | 1.000000 |
plt.figure(figsize=(10,5))
sns.heatmap(part2.corr(), annot=True, linewidths=.5, fmt= '.1f', center = 1 )
plt.show()
Hidden Score vs Loan on card
plt.figure(figsize=(15,7))
sns.countplot(part2['HiddenScore'],hue=part2['LoanOnCard'])
<AxesSubplot:xlabel='HiddenScore', ylabel='count'>
We can see that the count for LoanOnCard = 0 is much higher than for LoanOnCard = 1 across all the HiddenScore levels; this is because of class imbalance, which we will have to address before building the model.
sns.pairplot(part2)
<seaborn.axisgrid.PairGrid at 0x25be366a880>
sns.pairplot(part2, hue='LoanOnCard')
<seaborn.axisgrid.PairGrid at 0x25be5c48f40>
4. Data pre-processing:
• Segregate predictors vs target attributes
• Check for target balancing and fix it if found imbalanced.
• Perform train-test split.
Segregating Predictor and Target attributes.
X= part2.drop(['LoanOnCard'], axis = 1) #Predictor attributes
X.head()
| Age | CustomerSince | HighestSpend | ZipCode | HiddenScore | MonthlyAverageSpend | Level | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 34 | 9 | 180 | 93023 | 1 | 8.9 | 3 | 0 | 0 | 0 | 0 | 0 |
| 10 | 65 | 39 | 105 | 94710 | 4 | 2.4 | 3 | 0 | 0 | 0 | 0 | 0 |
| 11 | 29 | 5 | 45 | 90277 | 3 | 0.1 | 2 | 0 | 0 | 0 | 1 | 0 |
| 12 | 48 | 23 | 114 | 93106 | 2 | 3.8 | 3 | 0 | 1 | 0 | 0 | 0 |
| 13 | 59 | 32 | 40 | 94920 | 4 | 2.5 | 2 | 0 | 0 | 0 | 1 | 0 |
y = part2['LoanOnCard'] #Target Attribute
y.head(10)
9 1.0 10 0.0 11 0.0 12 0.0 13 0.0 14 0.0 15 0.0 16 1.0 17 0.0 18 1.0 Name: LoanOnCard, dtype: category Categories (2, float64): [0.0, 1.0]
We don't need ZipCode, and due to the high correlation between Age and CustomerSince, we drop these columns from X, i.e. the predictor variables.
X= part2.drop(['LoanOnCard','ZipCode','CustomerSince'], axis = 1) #Predictor attributes
X.head()
| Age | HighestSpend | HiddenScore | MonthlyAverageSpend | Level | Mortgage | Security | FixedDepositAccount | InternetBanking | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 34 | 180 | 1 | 8.9 | 3 | 0 | 0 | 0 | 0 | 0 |
| 10 | 65 | 105 | 4 | 2.4 | 3 | 0 | 0 | 0 | 0 | 0 |
| 11 | 29 | 45 | 3 | 0.1 | 2 | 0 | 0 | 0 | 1 | 0 |
| 12 | 48 | 114 | 2 | 3.8 | 3 | 0 | 1 | 0 | 0 | 0 |
| 13 | 59 | 40 | 4 | 2.5 | 2 | 0 | 0 | 0 | 1 | 0 |
Checking if all other variables have an impact on LoanOncard using hypothesis testing.
# Chi-square test of independence between each categorical predictor and
# the target (LoanOnCard), at a 5% significance level.
cat_col = [c for c in part2.select_dtypes(include=['category']).columns
           if c != 'LoanOnCard']
for feature in cat_col:
    contingency = pd.crosstab(part2['LoanOnCard'], part2[feature])
    chi, p_value, dof, expected = stats.chi2_contingency(contingency)
    if p_value < 0.05:  # significance level of 5%
        print('Rejecting Null Hypothesis. \n There is significant difference in {} Feature for different category of target variable(Loan on card)'.format(feature))
    else:
        print('Fail to Reject Null Hypothesis.\n There is no significant difference in {} Feature for different category of target variable(Loan on card)'.format(feature))
Rejecting Null Hypothesis. There is significant difference in HiddenScore Feature for different category of target variable(Loan on card) Rejecting Null Hypothesis. There is significant difference in Level Feature for different category of target variable(Loan on card) Fail to Reject Null Hypothesis. There is no significant difference in Security Feature for different category of target variable(Loan on card) Rejecting Null Hypothesis. There is significant difference in FixedDepositAccount Feature for different category of target variable(Loan on card) Fail to Reject Null Hypothesis. There is no significant difference in InternetBanking Feature for different category of target variable(Loan on card) Fail to Reject Null Hypothesis. There is no significant difference in CreditCard Feature for different category of target variable(Loan on card)
We can see that Security, InternetBanking and CreditCard do not show a significant difference across the target variable, so we drop these columns before building the model.
X.drop(['CreditCard','InternetBanking','Security'],axis=1,inplace=True)
X.head()
| Age | HighestSpend | HiddenScore | MonthlyAverageSpend | Level | Mortgage | FixedDepositAccount | |
|---|---|---|---|---|---|---|---|
| 9 | 34 | 180 | 1 | 8.9 | 3 | 0 | 0 |
| 10 | 65 | 105 | 4 | 2.4 | 3 | 0 | 0 |
| 11 | 29 | 45 | 3 | 0.1 | 2 | 0 | 0 |
| 12 | 48 | 114 | 2 | 3.8 | 3 | 0 | 0 |
| 13 | 59 | 40 | 4 | 2.5 | 2 | 0 | 0 |
Outlier Analysis
# Cap outliers in the skewed spend/mortgage columns: values outside the
# Tukey fences (1.5 * IQR beyond the quartiles) are replaced with the
# mean of the in-fence values.
col = ['HighestSpend', 'MonthlyAverageSpend', 'Mortgage']
for c in col:
    # Tukey fences from the quartiles
    q25, q75 = np.percentile(part2[c], 25), np.percentile(part2[c], 75)
    Threshold = (q75 - q25) * 1.5
    lower, upper = q25 - Threshold, q75 + Threshold
    before = len([v for v in part2[c] if v < lower or v > upper])
    print('{} Total Number of outliers in {} Before Imputing : {}'.format('\033[1m', c, before))
    # mean of the non-outlying rows only, truncated to int (this matches
    # the original behaviour; note int() truncates rather than rounds)
    in_fence = part2.loc[(part2[c] >= lower) & (part2[c] <= upper)]
    mean = int(in_fence[c].mean())
    print('{} Mean of {} is {}'.format('\033[1m', c, mean))
    # replace both tails with that mean
    part2[c] = np.where(part2[c] > upper, mean, part2[c])
    part2[c] = np.where(part2[c] < lower, mean, part2[c])
    after = len([v for v in part2[c] if v < lower or v > upper])
    print('{} Total Number of outliers in {} After Imputing : {}'.format('\033[1m', c, after))
    print('\n')
Total Number of outliers in HighestSpend Before Imputing : 96 Mean of HighestSpend is 71 Total Number of outliers in HighestSpend After Imputing : 0 Total Number of outliers in MonthlyAverageSpend Before Imputing : 324 Mean of MonthlyAverageSpend is 1 Total Number of outliers in MonthlyAverageSpend After Imputing : 0 Total Number of outliers in Mortgage Before Imputing : 291 Mean of Mortgage is 38 Total Number of outliers in Mortgage After Imputing : 0
Check for target balancing and fix it if found imbalanced.
f,axes=plt.subplots(1,2,figsize=(10,5))
part2['LoanOnCard'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('LoanOnCard',data=part2,ax=axes[1],order=[0,1])
axes[0].set_title('LoanOnCard Variable Pie Chart')
axes[1].set_title('LoanOnCard Variable Bar Graph')
plt.show()
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTE
X.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4980 entries, 9 to 4999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 4980 non-null int64 1 HighestSpend 4980 non-null int64 2 HiddenScore 4980 non-null category 3 MonthlyAverageSpend 4980 non-null float64 4 Level 4980 non-null category 5 Mortgage 4980 non-null int64 6 FixedDepositAccount 4980 non-null category dtypes: category(3), float64(1), int64(3) memory usage: 338.6 KB
# SMOTE-NC oversamples the minority class while handling categorical
# columns correctly. Derive the categorical column positions from the
# dtypes instead of hard-coding [2, 4, 6], so the list stays correct if
# columns are added/removed (for the current X this yields the same
# indices: HiddenScore=2, Level=4, FixedDepositAccount=6).
cat_idx = [X.columns.get_loc(c) for c in X.select_dtypes(include=['category']).columns]
smote_nc = SMOTENC(categorical_features=cat_idx, random_state=42)
x_s, y_s = smote_nc.fit_resample(X, y)
print('Before sampling:')
print(y.value_counts())
Before sampling: 0.0 4500 1.0 480 Name: LoanOnCard, dtype: int64
print('After sampling:')
print(y_s.value_counts())
After sampling: 0.0 4500 1.0 4500 Name: LoanOnCard, dtype: int64
Perform train-test split.
For Balanced Data
X_train, X_test, y_train, y_test = train_test_split(x_s, y_s, test_size=0.30, random_state=10)
X_train.shape, X_test.shape
((6300, 7), (2700, 7))
y_train.shape, y_test.shape
((6300,), (2700,))
For imbalanced Data.
# Split X and y into training and test set in 70:30 ratio
X_traini, X_testi, y_traini, y_testi = train_test_split(X, y, test_size=0.30, random_state=10)
X_traini.shape, X_testi.shape
((3486, 7), (1494, 7))
y_traini.shape, y_testi.shape
((3486,), (1494,))
5. Model training, testing and tuning:
• Design and train a Logistic regression and Naive Bayes classifiers.
• Display the classification accuracies for train and test data.
• Display and explain the classification report in detail.
• Apply all the possible tuning techniques to train the best model for the given data. Select the final best trained model with your comments for selecting this model.
from sklearn.linear_model import LogisticRegression
For SMOTE balanced data
Design and train a Logistic regression classifiers.
# Logistic regression trained on the SMOTE-balanced split.
logit = LogisticRegression()
logit.fit(X_train, y_train)
# hold on to the test predictions for the confusion matrix below
logit_pred = logit.predict(X_test)
Display the classification accuracies for train and test data.
print('Accuracy on Training data:',logit.score(X_train, y_train) )
print('Accuracy on Test data:',logit.score(X_test, y_test) )
Accuracy on Training data: 0.9022222222222223 Accuracy on Test data: 0.9022222222222223
# Confusion matrix heatmap for the balanced-data model.
# Rows/columns follow labels=[0, 1]: non-loan holders first, then loan holders.
class_names = ["Non-Loan holders", "Loan holders"]
cm = confusion_matrix(y_test, logit_pred, labels=[0, 1])
part_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(part_cm, annot=True, fmt='g')
plt.show()
Display and explain the classification report in detail.
print("classification Matrix:\n",classification_report(y_test,logit_pred))
classification Matrix:
precision recall f1-score support
0.0 0.92 0.89 0.90 1349
1.0 0.89 0.92 0.90 1351
accuracy 0.90 2700
macro avg 0.90 0.90 0.90 2700
weighted avg 0.90 0.90 0.90 2700
For Imbalanced Data
Design and train a Logistic regression classifiers.
# Fit the model on train
model = LogisticRegression(solver="liblinear")
model.fit(X_traini, y_traini)
#predict on test
y_predict = model.predict(X_testi)
Display the classification accuracies for train and test data.
print('Accuracy on Training data:',model.score(X_traini, y_traini) )
print('Accuracy on Test data:',model.score(X_testi, y_testi) )
Accuracy on Training data: 0.942627653471027 Accuracy on Test data: 0.9504685408299867
from sklearn import metrics

# Confusion-matrix heatmap for logistic regression on the imbalanced split.
# BUG FIX: the original passed labels=[1, 0] (class 1 first) while the first
# row/column name corresponded to class 0 — the heatmap axes were swapped.
# The names themselves also came from an unrelated loan project. Use
# labels=[0, 1] with matching class-value names so axes are correct, and call
# plt.show() instead of leaving the Axes repr as cell output.
class_names = ["Class 0", "Class 1"]  # order matches labels=[0, 1]
cm = metrics.confusion_matrix(y_testi, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.show()
<AxesSubplot:>
Display and explain the classification report in detail.
print("classification Matrix:\n",classification_report(y_testi,y_predict))
classification Matrix:
precision recall f1-score support
0.0 0.96 0.99 0.97 1347
1.0 0.84 0.61 0.71 147
accuracy 0.95 1494
macro avg 0.90 0.80 0.84 1494
weighted avg 0.95 0.95 0.95 1494
For SMOTE balanced data
Design and train a Naive Bayes classifier.
# Gaussian Naive Bayes on the SMOTE-balanced split. ravel() flattens a
# column-vector target to the 1-D shape sklearn expects.
g_model = GaussianNB().fit(X_train, y_train.ravel())
g_pred = g_model.predict(X_test)
Display the classification accuracies for train and test data.
# Accuracy on both splits for the balanced-data Naive Bayes model.
nb_train_acc = g_model.score(X_train, y_train)
nb_test_acc = g_model.score(X_test, y_test)
print('Accuracy on Training data:', nb_train_acc)
print('Accuracy on Test data:', nb_test_acc)
Accuracy on Training data: 0.846031746031746 Accuracy on Test data: 0.8403703703703703
# Confusion-matrix heatmap for Naive Bayes on the SMOTE-balanced test split.
# FIX(review): axis labels ("Non-Loan holders"/"Loan holders") belonged to an
# unrelated loan notebook; label by the actual 0/1 class values instead, and
# drop the no-op identity comprehension.
class_names = ["Class 0", "Class 1"]  # order matches labels=[0, 1]
cm = confusion_matrix(y_test, g_pred, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.show()
Display and explain the classification report in detail.
print("classification Matrix:\n",classification_report(y_test,g_pred))
classification Matrix:
precision recall f1-score support
0.0 0.82 0.87 0.85 1349
1.0 0.87 0.81 0.83 1351
accuracy 0.84 2700
macro avg 0.84 0.84 0.84 2700
weighted avg 0.84 0.84 0.84 2700
For Imbalanced Data
Design and train a Naive Bayes classifier.
# Gaussian Naive Bayes on the original (imbalanced) split; ravel() gives the
# 1-D target sklearn expects. Rebinds g_model, so run after the balanced cell.
g_model = GaussianNB().fit(X_traini, y_traini.ravel())
g_predi = g_model.predict(X_testi)
Display the classification accuracies for train and test data.
# Accuracy on both splits; on imbalanced data prefer the per-class report
# below over raw accuracy.
nb_train_acc_i = g_model.score(X_traini, y_traini)
nb_test_acc_i = g_model.score(X_testi, y_testi)
print('Accuracy on Training data:', nb_train_acc_i)
print('Accuracy on Test data:', nb_test_acc_i)
Accuracy on Training data: 0.8875502008032129 Accuracy on Test data: 0.8775100401606426
# Confusion-matrix heatmap for Naive Bayes on the imbalanced test split.
# FIX(review): axis labels ("Non-Loan holders"/"Loan holders") were copied
# from an unrelated loan project; label by the actual 0/1 class values, and
# drop the no-op identity comprehension.
class_names = ["Class 0", "Class 1"]  # order matches labels=[0, 1]
cm = confusion_matrix(y_testi, g_predi, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.show()
Display and explain the classification report in detail.
print("classification Matrix:\n",classification_report(y_testi,g_predi))
classification Matrix:
precision recall f1-score support
0.0 0.96 0.91 0.93 1347
1.0 0.42 0.62 0.50 147
accuracy 0.88 1494
macro avg 0.69 0.76 0.71 1494
weighted avg 0.90 0.88 0.89 1494
Apply all the possible tuning techniques to train the best model for the given data. Select the final best trained model with your comments for selecting this model.
K-Fold CV for finding best model
# Candidate estimators for the K-fold comparison cells below; those cells
# reference these exact names, so keep them stable.
LR_model=LogisticRegression()
# n_neighbors=13 — presumably chosen from an earlier elbow/grid search in
# this notebook; TODO confirm against the KNN tuning cell.
KNN_model=KNeighborsClassifier(n_neighbors=13)
GN_model=GaussianNB()
For imbalanced Data
# Compare the candidate classifiers with 10-fold cross-validated accuracy.
# Improvements over the original cell: the model list is a literal instead of
# repeated .append() calls, and the KFold splitter is created once outside the
# loop (it is loop-invariant — without shuffle it yields identical splits for
# every estimator, which is exactly what a fair comparison needs).
models = [('LR', LR_model), ('KNN', KNN_model), ('NB', GN_model)]
results = []   # per-model arrays of 10 fold accuracies (feeds the box plot)
names = []
scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=10)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # mean (std) accuracy across folds
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
# Box plot of per-fold accuracies for a visual comparison.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
LR: 0.941365 (0.010152) KNN: 0.913454 (0.008154) NB: 0.883735 (0.012682)
Using stratified K-fold cross-validation (folds preserve the class balance)
# Repeat the comparison with StratifiedKFold, which preserves the class ratio
# in every fold — the appropriate splitter when classes are imbalanced.
# NOTE(review): despite the "balanced data" heading, this cell scores the
# same X, y as the cell above — confirm whether the SMOTE-resampled
# features/targets were intended here instead.
# As above: model list as a literal, splitter hoisted out of the loop.
models = [('LR', LR_model), ('KNN', KNN_model), ('NB', GN_model)]
results = []
names = []
scoring = 'accuracy'
kfold = model_selection.StratifiedKFold(n_splits=10)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # mean (std) accuracy across folds
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
# Box plot of per-fold accuracies for a visual comparison.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
LR: 0.938153 (0.008318) KNN: 0.912450 (0.007082) NB: 0.883936 (0.013494)
6. Conclusion and improvisation:
Write your conclusion on the results